/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.crawler; import java.io.BufferedOutputStream; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.PrintStream; import java.lang.management.ManagementFactory; import java.net.InetAddress; import java.net.UnknownHostException; import java.security.KeyStore; import java.security.MessageDigest; import java.security.cert.Certificate; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TimeZone; import java.util.logging.LogManager; import java.util.logging.Logger; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.io.FileUtils; import org.apache.commons.io.output.TeeOutputStream; import org.apache.commons.lang.StringUtils; import org.archive.crawler.framework.Engine; import org.archive.crawler.restlet.EngineApplication; import org.archive.crawler.restlet.RateLimitGuard; import org.archive.util.ArchiveUtils; import org.archive.util.KeyTool; import org.restlet.Component; import org.restlet.Guard; import org.restlet.Server; import org.restlet.data.ChallengeScheme; import org.restlet.data.Protocol; /** * Main class for Heritrix crawler. * * Heritrix is usually launched by a shell script that backgrounds heritrix * that redirects all stdout and stderr emitted by heritrix to a log file. So * that startup messages emitted subsequent to the redirection of stdout and * stderr show on the console, this class prints usage or startup output * such as where the web UI can be found, etc., to a STARTLOG that the shell * script is waiting on. As soon as the shell script sees output in this file, * it prints its content and breaks out of its wait. * See ${HERITRIX_HOME}/bin/heritrix. * * <p>Heritrix can also be embedded or launched by webapp initialization or * by JMX bootstrapping. So far I count 4 methods of instantiation: * <ol> * <li>From this classes main -- the method usually used;</li> * <li>From the Heritrix UI (The local-instances.jsp) page;</li> * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li> * <li>A container such as tomcat or jboss.</li> * </ol> * * @author gojomo * @author Kristinn Sigurdsson * @author Stack */ public class Heritrix { private static final String ADHOC_PASSWORD = "password"; private static final String ADHOC_KEYSTORE = "adhoc.keystore"; private static final Logger logger = Logger.getLogger(Heritrix.class.getName()); /** Name of configuration directory */ private static final String CONF = "conf"; /** Name of the heritrix properties file */ private static final String PROPERTIES = "logging.properties"; protected Engine engine; protected Component component; /** * Heritrix start log file. * * This file contains standard out produced by this main class for startup * only. Used by heritrix shell script. Name here MUST match that in the * <code>bin/heritrix</code> shell script. This is a DEPENDENCY the shell * wrapper has on this here java heritrix. */ private static final String STARTLOG = "heritrix_dmesg.log"; private static void usage(PrintStream out, String[] args) { HelpFormatter hf = new HelpFormatter(); hf.printHelp("Heritrix", options()); out.println("Your arguments were: "+StringUtils.join(args, ' ')); } private static Options options() { Options options = new Options(); options.addOption("h", "help", true, "Usage information." ); options.addOption("a", "web-admin", true, "REQUIRED. Specifies the " + "authorization username and password which must be supplied to " + "access the web interface. This may be of the form " + "\"password\" (which leaves username as the default 'admin'), " + "\"username:password\", or \"@filename\" for a file that " + "includes the single line \"username:password\". "); options.addOption("j", "jobs-dir", true, "The jobs directory. " + "Defaults to ./jobs"); options.addOption("l", "logging-properties", true, "The full path to the logging properties file " + "(eg, conf/logging.properties). If present, this file " + "will be used to configure Java logging. Defaults to " + "${heritrix.home}/conf/logging.properties or if no " + "heritrix.home property set, ./conf/logging.properties"); options.addOption("b", "web-bind-hosts", true, "A comma-separated list of addresses/hostnames for the " + "web interface to bind to."); options.addOption("p", "web-port", true, "The port the web interface " + "should listen on."); options.addOption("s", "ssl-params", true, "Specify a keystore " + "path, keystore password, and key password for HTTPS use. " + "Separate with commas, no whitespace."); return options; } private static File getDefaultPropertiesFile() { File confDir = new File(getHeritrixHome(),CONF); File props = new File(confDir, PROPERTIES); return props; } private static CommandLine getCommandLine(PrintStream out, String[] args) { CommandLineParser clp = new GnuParser(); CommandLine cl; try { cl = clp.parse(options(), args); } catch (ParseException e) { usage(out, args); return null; } if (cl.getArgList().size() != 0) { usage(out, args); return null; } return cl; } /** * Launches a local Engine and restfgul web interface given the * command-line options or defaults. * * @param args Command line arguments. * @throws Exception */ public static void main(String[] args) throws Exception { new Heritrix().instanceMain(args); } public void instanceMain(String[] args) throws Exception { System.out.println(System.getProperty("java.vendor") + ' ' + System.getProperty("java.runtime.name") + ' ' + System.getProperty("java.runtime.version")); // ensure using java 1.6+ before hitting a later cryptic error String version = System.getProperty("java.version"); float floatVersion = Float.valueOf(version.substring(0,version.indexOf('.',2))); if(floatVersion<1.6) { System.err.println("Heritrix (as of version 3) requires Java 1.6 or higher."); System.err.println("You attempted to launch with: "+version); System.err.println("Please try again with a later Java."); System.exit(1); } // Set some system properties early. // Can't use class names here without loading them. String ignoredSchemes = "org.archive.net.UURIFactory.ignored-schemes"; if (System.getProperty(ignoredSchemes) == null) { System.setProperty(ignoredSchemes, "mailto, clsid, res, file, rtsp, about"); } String maxFormSize = "org.mortbay.jetty.Request.maxFormContentSize"; if (System.getProperty(maxFormSize) == null) { System.setProperty(maxFormSize, "52428800"); } BufferedOutputStream startupOutStream = new BufferedOutputStream( new FileOutputStream( new File(getHeritrixHome(), STARTLOG)),16384); PrintStream startupOut = new PrintStream( new TeeOutputStream( System.out, startupOutStream)); CommandLine cl = getCommandLine(startupOut, args); if (cl == null) return; if (cl.hasOption('h')) { usage(startupOut, args); return ; } // DEFAULTS until changed by cmd-line options int port = 8443; Set<String> bindHosts = new HashSet<String>(); String authLogin = "admin"; String authPassword = null; String keystorePath; String keystorePassword; String keyPassword; File properties = getDefaultPropertiesFile(); String aOption = cl.getOptionValue('a'); if (cl.hasOption('a')) { String usernameColonPassword = aOption; try { if(aOption.startsWith("@")) { usernameColonPassword = FileUtils.readFileToString(new File(aOption.substring(1))).trim(); } int colonIndex = usernameColonPassword.indexOf(':'); if(colonIndex>-1) { authLogin = usernameColonPassword.substring(0,colonIndex); authPassword = usernameColonPassword.substring(colonIndex+1); } else { authPassword = usernameColonPassword; } } catch (IOException e) { // only if @filename read had problems System.err.println("Unable to read [username:]password from "+aOption); } } if(authPassword==null) { System.err.println( "You must specify a valid [username:]password for the web interface using -a." ); System.exit(1); authPassword = ""; // suppresses uninitialized warning } File jobsDir = null; if (cl.hasOption('j')) { jobsDir = new File(cl.getOptionValue('j')); } else { jobsDir = new File("./jobs"); } if (cl.hasOption('l')) { properties = new File(cl.getOptionValue('l')); } if (cl.hasOption('b')) { String hosts = cl.getOptionValue('b'); List<String> list; if("/".equals(hosts)) { // '/' means all, signified by empty-list list = new ArrayList<String>(); } else { list = Arrays.asList(hosts.split(",")); } bindHosts.addAll(list); } else { // default: only localhost bindHosts.add("localhost"); } if (cl.hasOption('p')) { port = Integer.parseInt(cl.getOptionValue('p')); } // SSL options (possibly none, in which case adhoc keystore // is created or reused if(cl.hasOption('s')) { String[] sslParams = cl.getOptionValue('s').split(","); keystorePath = sslParams[0]; keystorePassword = sslParams[1]; keyPassword = sslParams[2]; } else { // use ad hoc keystore, creating if necessary keystorePath = ADHOC_KEYSTORE; keystorePassword = ADHOC_PASSWORD; keyPassword = ADHOC_PASSWORD; useAdhocKeystore(startupOut); } if (properties.exists()) { FileInputStream finp = new FileInputStream(properties); LogManager.getLogManager().readConfiguration(finp); finp.close(); } // Set timezone here. Would be problematic doing it if we're running // inside in a container. TimeZone.setDefault(TimeZone.getTimeZone("GMT")); setupGlobalProperties(port); // Start Heritrix. try { engine = new Engine(jobsDir); component = new Component(); if(bindHosts.isEmpty()) { // listen all addresses setupServer(port, null, keystorePath, keystorePassword, keyPassword); } else { // bind only to declared addresses, or just 'localhost' for(String address : bindHosts) { setupServer(port, address, keystorePath, keystorePassword, keyPassword); } } component.getClients().add(Protocol.FILE); component.getClients().add(Protocol.CLAP); Guard guard = new RateLimitGuard(null, ChallengeScheme.HTTP_DIGEST, "Authentication Required"); guard.getSecrets().put(authLogin, authPassword.toCharArray()); component.getDefaultHost().attach(guard); guard.setNext(new EngineApplication(engine)); component.start(); startupOut.println("engine listening at port "+port); startupOut.println("operator login set per " + ((aOption.startsWith("@")) ? "file "+aOption : "command-line")); if(authPassword.length()<8 || authPassword.matches("[a-zA-Z]{0,10}") ||authPassword.matches("\\d{0,10}")) { startupOut.println( "NOTE: We recommend a longer, stronger password, especially if your web \n" + "interface will be internet-accessible."); } if (cl.hasOption('r')) { engine.requestLaunch(cl.getOptionValue('r')); } } catch (Exception e) { // Show any exceptions in STARTLOG. e.printStackTrace(startupOut); if (component != null) { component.stop(); } throw e; } finally { startupOut.flush(); // stop writing to side startup file startupOutStream.close(); System.out.println("Heritrix version: " + ArchiveUtils.VERSION); } } /** * Setup global system properties that may be of use elsewhere. * * @param port */ protected void setupGlobalProperties(int port) { if (System.getProperty("heritrix.port") == null) { System.setProperty("heritrix.port", port + ""); } String hostname = "localhost.localdomain"; if (System.getProperty("heritrix.hostname") == null) { try { hostname = InetAddress.getLocalHost().getCanonicalHostName(); } catch (UnknownHostException ue) { logger.warning("Failed getHostAddress for this host: " + ue); } System.setProperty("heritrix.hostname", hostname); } // while not guaranteed, on our platforms of interest this name // always seems to be PID@HOSTNAME String runtimeName = ManagementFactory.getRuntimeMXBean().getName(); if(System.getProperty("heritrix.runtimeName") == null) { System.setProperty("heritrix.runtimeName", runtimeName); } if (System.getProperty("heritrix.pid") == null && runtimeName.matches("\\d+@\\S+")) { System.setProperty("heritrix.pid", runtimeName.substring(0,runtimeName.indexOf("@"))); } } /** * Perform preparation to use an ad-hoc, created-as-necessary * certificate/keystore for HTTPS access. A keystore with new * cert is created if necessary, as adhoc.keystore in the working * directory. Otherwise, a preexisting adhoc.keystore is read * and the certificate fingerprint shown to assist in operator * browser-side verification. * @param startupOut where to report fingerprint */ protected void useAdhocKeystore(PrintStream startupOut) { try { File keystoreFile = new File(ADHOC_KEYSTORE); if(!keystoreFile.exists()) { String[] args = { "-keystore",ADHOC_KEYSTORE, "-storepass",ADHOC_PASSWORD, "-keypass",ADHOC_PASSWORD, "-alias","adhoc", "-genkey","-keyalg","RSA", "-dname", "CN=Heritrix Ad-Hoc HTTPS Certificate", "-validity","3650"}; // 10 yr validity KeyTool.main(args); } KeyStore keystore = KeyStore.getInstance(KeyStore.getDefaultType()); InputStream inStream = new ByteArrayInputStream( FileUtils.readFileToByteArray(keystoreFile)); keystore.load(inStream, ADHOC_PASSWORD.toCharArray()); Certificate cert = keystore.getCertificate("adhoc"); byte[] certBytes = cert.getEncoded(); byte[] sha1 = MessageDigest.getInstance("SHA1").digest(certBytes); startupOut.print("Using ad-hoc HTTPS certificate with fingerprint...\nSHA1"); for(byte b : sha1) { startupOut.print(String.format(":%02X",b)); } startupOut.println("\nVerify in browser before accepting exception."); } catch (Exception e) { // fatal, rethrow throw new RuntimeException(e); } } /** * Create an HTTPS restlet Server instance matching the given parameters. * * @param port * @param address * @param keystorePath * @param keystorePassword * @param keyPassword */ protected void setupServer(int port, String address, String keystorePath, String keystorePassword, String keyPassword) { Server server = new Server(Protocol.HTTPS,address,port,null); component.getServers().add(server); server.getContext().getParameters().add("keystorePath", keystorePath); server.getContext().getParameters().add("keystorePassword", keystorePassword); server.getContext().getParameters().add("keyPassword", keyPassword); } /** * Exploit <code>-Dheritrix.home</code> if available to us. * Is current working dir if no heritrix.home property supplied. * @return Heritrix home directory. * @throws IOException */ protected static File getHeritrixHome() { String home = System.getProperty("heritrix.home"); if (home != null && home.length() > 0) { File candidate = new File(home); if (candidate.exists()) { return candidate; } logger.warning("HERITRIX_HOME <" + home +"> does not exist; " + "using current working directory instead."); } return new File(System.getProperty("user.dir")); } public Engine getEngine() { return engine; } public Component getComponent() { return component; } }